Hospital Stay Analysis

EDA

Import Data

Column Descriptions

  • ID: Unique identifier.
  • Lgth.of.Sty: Length of stay (Variable of interest).
  • Age: Average Age of the patient.
  • Inf.Risk: Infection risk.
    • Average estimated probability of hospital infection
  • R.Cul.Rat: Culture rate.
    • Ratio of # of cultures performed to number of patients without symptoms of infection times 100
  • R.CX.ray.Rat: Chest X-ray rate.
    • Ratio of # of X-rays performed to number of patients without symptoms of pneumonia times 100
  • N.Beds: Number of beds in the hospital.
    • Average number of beds.
  • Med.Sc.Aff: Medical school affiliation.
    • 1=Yes, 2=No
  • Region: Region of the hospital.
    • Geographic region: 1=NE, 2=NC,3=S, 4=W
  • Avg.Pat: Average number of patients.
    • Average number of patients in hospital per day.
  • Avg.Nur: Average number of nurses.
    • Average number of full time nurses.
  • Pct.Ser.Fac: Percentage of service facilities.
    • Percent of 35 potential facilities and services that are provided by the hospital
# Load the hospital dataset from the project CSV
hospital_data <- read.csv("./project_hospitals.csv", header = TRUE)

# Peek at the first rows, then list the available feature names
head(hospital_data)
names(hospital_data)
##  [1] "ID"           "Lgth.of.Sty"  "Age"          "Inf.Risk"     "R.Cul.Rat"    "R.CX.ray.Rat" "N.Beds"      
##  [8] "Med.Sc.Aff"   "Region"       "Avg.Pat"      "Avg.Nur"      "Pct.Ser.Fac"

Sanity check

Summary of the Data Set

# Summary statistics (min / quartiles / mean / max) for every column
summary(hospital_data)
##        ID       Lgth.of.Sty          Age           Inf.Risk       R.Cul.Rat      R.CX.ray.Rat        N.Beds     
##  Min.   :  1   Min.   : 6.700   Min.   :38.80   Min.   :1.300   Min.   : 1.60   Min.   : 39.60   Min.   : 29.0  
##  1st Qu.: 29   1st Qu.: 8.340   1st Qu.:50.90   1st Qu.:3.700   1st Qu.: 8.40   1st Qu.: 69.50   1st Qu.:106.0  
##  Median : 57   Median : 9.420   Median :53.20   Median :4.400   Median :14.10   Median : 82.30   Median :186.0  
##  Mean   : 57   Mean   : 9.648   Mean   :53.23   Mean   :4.355   Mean   :15.79   Mean   : 81.63   Mean   :252.2  
##  3rd Qu.: 85   3rd Qu.:10.470   3rd Qu.:56.20   3rd Qu.:5.200   3rd Qu.:20.30   3rd Qu.: 94.10   3rd Qu.:312.0  
##  Max.   :113   Max.   :19.560   Max.   :65.90   Max.   :7.800   Max.   :60.50   Max.   :133.50   Max.   :835.0  
##    Med.Sc.Aff       Region         Avg.Pat         Avg.Nur       Pct.Ser.Fac   
##  Min.   :1.00   Min.   :1.000   Min.   : 20.0   Min.   : 14.0   Min.   : 5.70  
##  1st Qu.:2.00   1st Qu.:2.000   1st Qu.: 68.0   1st Qu.: 66.0   1st Qu.:31.40  
##  Median :2.00   Median :2.000   Median :143.0   Median :132.0   Median :42.90  
##  Mean   :1.85   Mean   :2.363   Mean   :191.4   Mean   :173.2   Mean   :43.16  
##  3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:252.0   3rd Qu.:218.0   3rd Qu.:54.30  
##  Max.   :2.00   Max.   :4.000   Max.   :791.0   Max.   :656.0   Max.   :80.00

Finding empty values

colSums(is.na(hospital_data))
##           ID  Lgth.of.Sty          Age     Inf.Risk    R.Cul.Rat R.CX.ray.Rat       N.Beds   Med.Sc.Aff       Region 
##            0            0            0            0            0            0            0            0            0 
##      Avg.Pat      Avg.Nur  Pct.Ser.Fac 
##            0            0            0
Plot of missing data
# Missing-data heatmap (naniar::vis_miss) with compact title/axis formatting
vis_miss(hospital_data) +
  labs(title = "Visualizing Missing Data", x = "", y = "") +
  theme_minimal() +
  theme(plot.title = element_text(size = 8, face = "bold"),
        plot.subtitle = element_text(size = 8),
        axis.text.x = element_text(angle = 90, hjust = 1))

Our data set hospital_data, read from the csv project_hospitals.csv, contains 113 observations of 12 variables. We have no missing values for any variable, thus we do not need to impute for any observations. Out of the 12 variables, one of them is an identifier variable ID, and will not be used for predicting. The patient’s length of stay Lgth.of.Sty is the variable we would like to predict. Therefore, we have 10 potential explanatory variables and 1 response variable.

Visualizing Hospital Stay Distribution

# Histogram of the response variable: 2-day bins, single fill colour
# taken from the project palette (legend suppressed)
ggplot(hospital_data, aes(x = Lgth.of.Sty, fill = factor(1))) +
  geom_histogram(binwidth = 2, color = "black") +
  scale_fill_manual(values = base_palette[3]) +
  guides(fill = "none") +
  labs(x = "Length of Stay (Days)", y = "Count",
       title = "Distribution of Length of Stay")

We see some slight right skew for Lgth.of.Sty

Data Cleaning

# Drop the identifier column; it carries no predictive information
base_data <- hospital_data |> select(-ID)
#base_data$RegionRaw <- base_data$Region

# Encode the two categorical variables as labelled factors in one step.
# This replaces the original two-step as.factor() + plyr::revalue() recode
# with base R, producing identical factor levels in the same order.
base_data$Region <- factor(base_data$Region,
                           levels = c(1, 2, 3, 4),
                           labels = c("NE", "NC", "S", "W"))
base_data$Med.Sc.Aff <- factor(base_data$Med.Sc.Aff,
                               levels = c(1, 2),
                               labels = c("Yes", "No"))

# Human-readable labels, positionally aligned with names(base_data)
base_data_labels <- c(
  "Length of Stay",
  "Age",
  "Infection Risk",
  "Routine culturing ratio",
  "Routine chest X-ray ratio",
  "Number of beds",
  "Medical School Affiliation",
  "Region",
  "Average Daily census",
  "Number of nurses",
  "Available facilities"
)

Summary of the cleaned data

summary(base_data)
##   Lgth.of.Sty          Age           Inf.Risk       R.Cul.Rat      R.CX.ray.Rat        N.Beds      Med.Sc.Aff Region 
##  Min.   : 6.700   Min.   :38.80   Min.   :1.300   Min.   : 1.60   Min.   : 39.60   Min.   : 29.0   Yes:17     NE:28  
##  1st Qu.: 8.340   1st Qu.:50.90   1st Qu.:3.700   1st Qu.: 8.40   1st Qu.: 69.50   1st Qu.:106.0   No :96     NC:32  
##  Median : 9.420   Median :53.20   Median :4.400   Median :14.10   Median : 82.30   Median :186.0              S :37  
##  Mean   : 9.648   Mean   :53.23   Mean   :4.355   Mean   :15.79   Mean   : 81.63   Mean   :252.2              W :16  
##  3rd Qu.:10.470   3rd Qu.:56.20   3rd Qu.:5.200   3rd Qu.:20.30   3rd Qu.: 94.10   3rd Qu.:312.0                     
##  Max.   :19.560   Max.   :65.90   Max.   :7.800   Max.   :60.50   Max.   :133.50   Max.   :835.0                     
##     Avg.Pat         Avg.Nur       Pct.Ser.Fac   
##  Min.   : 20.0   Min.   : 14.0   Min.   : 5.70  
##  1st Qu.: 68.0   1st Qu.: 66.0   1st Qu.:31.40  
##  Median :143.0   Median :132.0   Median :42.90  
##  Mean   :191.4   Mean   :173.2   Mean   :43.16  
##  3rd Qu.:252.0   3rd Qu.:218.0   3rd Qu.:54.30  
##  Max.   :791.0   Max.   :656.0   Max.   :80.00

Initial Graphs

Correlation

# Pearson correlations among the numeric variables only (factors excluded)
# (assignment changed from `=` to the idiomatic `<-`)
cor_matrix <- cor(base_data |> select(-Region, -Med.Sc.Aff))
ggcorrplot(cor_matrix, lab = TRUE)

Length of Stay correlates most strongly with Infection Risk and Average Patients.

# Base-graphics scatterplot matrix with a per-panel OLS fit line
pairs(base_data |> select(-Region, -Med.Sc.Aff), panel = function(x, y) {
  points(x, y, pch = 16, col = "blue")
  abline(lm(y ~ x), col = "red")
})

# GGally pairs plot; lower triangle shows lm-smoothed scatterplots
ggpairs(base_data, lower = list(continuous = wrap("smooth", method = "lm", color = "blue")))

The variables Number of Beds, Average Patients, Average Nurses, and Percentage of service facilities; exhibit high intercorrelation, particularly between Number of Beds and Average Patients (0.981), which could introduce collinearity in a predictive model.

An important strong correlation that appears to be an important factor in hospital stays is Infection Risk (0.533).

Both Culture Rate and Chest X-Ray Rate display moderate correlation with Infection Risk; this suggests an intuitive relationship: as infection risk increases, more diagnostic tests are performed. Infection Risk may also serve as a useful predictor for Length of Stay, as higher infection risks could lead to prolonged hospital stays.

Correlograms with Log Transformations

# Numeric-only copy for the correlation comparisons below
cor_data <- base_data %>% select(-Med.Sc.Aff, -Region)
# "linear-log": log-transform every predictor, keep the response linear
lin_log_cor_data <- log(cor_data)
lin_log_cor_data$Lgth.of.Sty <- cor_data$Lgth.of.Sty
# "log-linear": keep predictors linear, log-transform the response
log_lincor_data <- cor_data %>% select(-Lgth.of.Sty)
log_lincor_data$logLgth.of.Sty <- log(cor_data$Lgth.of.Sty)

Linear-Linear Corr

# Correlations with all variables on their original (linear) scale
correlation_matrix <- cor(cor_data, method = "pearson")
ggcorrplot(correlation_matrix, lab = TRUE, title = "Linear-Linear")

Linear-Log Corr

# Correlations of log-transformed predictors against the linear response
lin_log_correlation_matrix <- cor(lin_log_cor_data, method = "pearson")
ggcorrplot(lin_log_correlation_matrix, lab = TRUE, title = "Linear-Log")

Log-Linear Corr

# Correlations of linear predictors against the log-transformed response
log_lin_correlation_matrix <- cor(log_lincor_data, method = "pearson")
ggcorrplot(log_lin_correlation_matrix, lab = TRUE, title = "Log-Linear")

Length of Stay vs other continuous variables

# Reshape the continuous predictors to long form so each can be faceted
# against Length of Stay. Selection is by name rather than the original
# fragile positional indices c(2:6, 9:11), which would silently break if
# columns were reordered.
long_data <- base_data %>%
  pivot_longer(cols = !c(Lgth.of.Sty, Med.Sc.Aff, Region),
               names_to = "Variable", values_to = "Value")

# Loess-smoothed scatterplots, one facet per predictor
ggplot(long_data, aes(x = Value, y = Lgth.of.Sty)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess") +
  facet_wrap(~ Variable, scales = "free_x") +
  theme_minimal()

Length of Stay vs Log of other continuous variables

# Same long-form reshape as above, selecting columns by name instead of
# the fragile positional indices c(2:6, 9:11)
long_data <- base_data %>%
  pivot_longer(cols = !c(Lgth.of.Sty, Med.Sc.Aff, Region),
               names_to = "Variable", values_to = "Value")

# Length of Stay against the log of each continuous predictor
ggplot(long_data, aes(x = log(Value), y = Lgth.of.Sty)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess") +
  facet_wrap(~ Variable, scales = "free_x") +
  theme_minimal()

Log Length of Stay vs other continuous variables

# Define color palette. "Set1" supports at most 9 colours and only 8
# variables are faceted, so request 8 — the original n = 16 exceeded the
# palette maximum, triggering a brewer.pal warning and capping at 9.
plot_colors <- RColorBrewer::brewer.pal(n = 8, name = "Set1")

# Convert data to long format for faceting (name-based column selection),
# keeping only strictly positive values so log() is defined
long_data <- base_data %>%
  pivot_longer(cols = !c(Lgth.of.Sty, Med.Sc.Aff, Region),
               names_to = "Variable", values_to = "Value") %>%
  filter(Value > 0, Lgth.of.Sty > 0)

# Log-log faceted scatterplots with per-variable loess smoothers
ggplot(long_data, aes(x = log(Value), y = log(Lgth.of.Sty))) +
  geom_point(alpha = 0.6, aes(color = Variable)) +
  geom_smooth(method = "loess", se = FALSE, aes(color = Variable)) +
  scale_color_manual(values = plot_colors) +
  facet_wrap(~ Variable, scales = "free_x") +
  ggtitle("Log-Transformed Variables vs Log Length of Stay") +
  xlab("Log Predictor Variables") +
  ylab("Log Length of Stay (Days)") +
  theme_minimal() +
  theme(legend.position = "none")

Categorical vs Length of Stay

# Boxplots of stay length across the two categorical predictors,
# each coloured by the other factor for a two-way view
region_patient <- ggplot(base_data, aes(x = Region, y = Lgth.of.Sty, colour = Med.Sc.Aff)) +
  geom_boxplot() +
  ggtitle("Region vs Patient Length of Stay", "Grouped by Medical School Affiliation")

med_assoc <- ggplot(base_data, aes(x = Med.Sc.Aff, y = Lgth.of.Sty, colour = Region)) +
  geom_boxplot() +
  ggtitle("Med. School Aff. vs Pat, Length of Stay", "Grouped by Region")

# Stack the two panels vertically
grid.arrange(grobs = list(med_assoc, region_patient), ncol = 1)

By looking at the distributions (via the boxplots), we can see that there are some differences between a patient’s length of stay if we group by Region (or by Medical School Affiliation).

If grouping by Medical school affiliation: Length of stay between the regions is fairly similar on average if there is an affiliation for medical school, but the distributions are much more different between the regions. If there is no affiliation, then the distributions are a bit more similar, but the averages are different between each region.

If grouping by Region: Length of stay on average tends to be similar between the regions (being slightly less in the W and S regions) with NE having a wider distribution of length of stay. Interesting to note that medical school affiliation tends to have lower length of stay if the affiliation is none.

Continuous grouped by Categorical vs Length of Stay

# One scatterplot per continuous predictor vs Length of Stay, coloured by
# Region. Fixes two defects in the original:
#  * labels now use base_data_labels[indexes[i]] so each panel is titled
#    with its own variable — the original used base_data_labels[i], which
#    shifted every title/axis label (e.g. the Age plot was labelled
#    "Length of Stay")
#  * the deprecated aes_string() is replaced with the .data pronoun
plot_list <- list()
indexes <- c(2:6, 9:11)
for (i in seq_along(indexes)) {
  x_var <- names(base_data)[indexes[i]]
  x_lab <- base_data_labels[indexes[i]]

  temp_plot <- ggplot(data = base_data, aes(x = .data[[x_var]], y = Lgth.of.Sty, color = Region)) +
    geom_point() +
    geom_smooth() +
    ggtitle(paste0(x_lab, " vs Length of Stay"), "Grouped by Region") +
    xlab(x_lab) +
    ylab("Length of Stay (Days)") +
    theme(legend.position = "right")
  plot_list[[length(plot_list) + 1]] <- temp_plot
}

grid.arrange(grobs = plot_list, ncol = 2)

Grouped by Medical School Affiliation

# One scatterplot per continuous predictor vs Length of Stay, coloured by
# Medical School Affiliation. Fixes the same label-offset bug as the
# Region version (base_data_labels[i] -> base_data_labels[indexes[i]])
# and replaces the deprecated aes_string() with the .data pronoun.
plot_list <- list()
indexes <- c(2:6, 9:11)
for (i in seq_along(indexes)) {
  x_var <- names(base_data)[indexes[i]]
  x_lab <- base_data_labels[indexes[i]]

  temp_plot <- ggplot(data = base_data, aes(x = .data[[x_var]], y = Lgth.of.Sty, color = Med.Sc.Aff)) +
    geom_point() +
    geom_smooth() +
    ggtitle(paste0(x_lab, " vs Length of Stay"), "Grouped by Medical School Affiliation") +
    xlab(x_lab) +
    ylab("Length of Stay (Days)") +
    theme(legend.position = "right")
  plot_list[[length(plot_list) + 1]] <- temp_plot
}

grid.arrange(grobs = plot_list, ncol = 2)

Objective 1

# Baseline fit: Length of Stay regressed on all 10 candidate predictors
# (assignment changed from `=` to the idiomatic `<-`)
all_vars_model <- lm(Lgth.of.Sty ~ ., data = base_data)
# 2x2 grid of the standard lm diagnostic plots
par(mfrow = c(2,2))
plot(all_vars_model)

All Variables Model Summary

summary(all_vars_model)
## 
## Call:
## lm(formula = Lgth.of.Sty ~ ., data = base_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3048 -0.6608 -0.0272  0.5862  6.3001 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.322292   1.782122   1.864 0.065222 .  
## Age           0.079922   0.028266   2.827 0.005668 ** 
## Inf.Risk      0.439665   0.127298   3.454 0.000812 ***
## R.Cul.Rat     0.005546   0.015982   0.347 0.729299    
## R.CX.ray.Rat  0.012688   0.007147   1.775 0.078892 .  
## N.Beds       -0.004851   0.003603  -1.346 0.181224    
## Med.Sc.AffNo -0.266644   0.441089  -0.605 0.546872    
## RegionNC     -0.812966   0.351406  -2.313 0.022744 *  
## RegionS      -1.158277   0.351704  -3.293 0.001370 ** 
## RegionW      -1.880560   0.444136  -4.234  5.1e-05 ***
## Avg.Pat       0.015182   0.004424   3.432 0.000872 ***
## Avg.Nur      -0.005891   0.002218  -2.656 0.009203 ** 
## Pct.Ser.Fac  -0.012179   0.013774  -0.884 0.378698    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.231 on 100 degrees of freedom
## Multiple R-squared:  0.6299, Adjusted R-squared:  0.5855 
## F-statistic: 14.18 on 12 and 100 DF,  p-value: < 2.2e-16

The diagnostic plots suggest the linear model fits reasonably well.

The Residuals vs. Fitted and Scale-Location plots show residuals scattered around zero, with no strong signs of nonlinearity or heteroskedasticity. However, Observation 47 stands out, showing a large residual across multiple plots. In the Q–Q plot, most points follow the regression line, but Observation 47 appears on the right tail, deviating from normality.

Leverage diagnostics indicate that most data points are well-standardized around the fitted line. Observation 47 is near the threshold for Cook’s distance, suggesting potential influence, while Observation 112 exhibits high leverage without strong influence. These points may warrant further investigation to understand their impact on the model.

No major assumption violations are evident, but examining these observations could help assess model robustness.

Variable Inflation

vif(all_vars_model)
##                   GVIF Df GVIF^(1/(2*Df))
## Age           1.176172  1        1.084515
## Inf.Risk      2.154694  1        1.467888
## R.Cul.Rat     1.978520  1        1.406599
## R.CX.ray.Rat  1.416265  1        1.190070
## N.Beds       35.699204  1        5.974881
## Med.Sc.Aff    1.855334  1        1.362107
## Region        1.715222  3        1.094091
## Avg.Pat      34.211423  1        5.849053
## Avg.Nur       7.055523  1        2.656224
## Pct.Ser.Fac   3.241812  1        1.800503

Number of Beds and Average Patients have extremely high VIFs, suggesting high multicollinearity. This makes sense, as the more room a hospital has (number of beds) then the more patients they can have on average. Average number of full time nurses has potential collinearity as well, which also makes sense as a hospital with more beds will likely have more nursing staff to take care of the patients.

Model Explorations

Linear Model

# Reduced formula: drops the collinear hospital-size variables
# (N.Beds, Avg.Pat) flagged by the VIF check above
test_model <- Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region
test_model_fit <- lm(test_model, data = base_data)

# Diagnostic plots in a 2x2 grid, then restore the single-panel layout
par(mfrow = c(2,2))
plot(test_model_fit)

par(mfrow = c(1,1))

In comparing the first model (all variables model) and this model, the residuals vs. fitted plot now shows points more evenly scattered around the reference line, indicating fewer systematic patterns in the residuals and a flatter loess curve.

The Q–Q plot remains largely linear, though observations such as #47 and #112 still appear in the right tail, suggesting they continue to exert some influence.

The scale–location plot demonstrates that the spread of residuals has become more consistent across fitted values, with a flatter red line that signals more homoscedasticity than before.

The residuals vs. leverage plot, #47 no longer rises as close to the typical Cook’s distance threshold, while #112 has shifted from being primarily a high‐leverage point to one with a moderately large residual—though neither appears as influential as in the initial model.

Model Summary
summary(test_model_fit)
## 
## Call:
## lm(formula = test_model, data = base_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7517 -0.8456 -0.0709  0.5848  6.9770 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.025158   1.951126   2.063  0.04165 *  
## Age           0.078458   0.031248   2.511  0.01362 *  
## Inf.Risk      0.576846   0.136458   4.227 5.16e-05 ***
## R.Cul.Rat    -0.013363   0.017119  -0.781  0.43682    
## R.CX.ray.Rat  0.011462   0.007911   1.449  0.15045    
## Avg.Nur       0.001031   0.001621   0.636  0.52605    
## Pct.Ser.Fac  -0.003332   0.014373  -0.232  0.81716    
## Med.Sc.AffNo -0.970771   0.460747  -2.107  0.03757 *  
## RegionNC     -0.962513   0.374275  -2.572  0.01156 *  
## RegionS      -1.135900   0.376770  -3.015  0.00324 ** 
## RegionW      -2.511570   0.459006  -5.472 3.20e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.363 on 102 degrees of freedom
## Multiple R-squared:  0.5368, Adjusted R-squared:  0.4914 
## F-statistic: 11.82 on 10 and 102 DF,  p-value: 2.727e-13
vif(test_model_fit)
##                  GVIF Df GVIF^(1/(2*Df))
## Age          1.171453  1        1.082337
## Inf.Risk     2.017887  1        1.420523
## R.Cul.Rat    1.850070  1        1.360173
## R.CX.ray.Rat 1.414378  1        1.189276
## Avg.Nur      3.071224  1        1.752491
## Pct.Ser.Fac  2.877054  1        1.696188
## Med.Sc.Aff   1.649862  1        1.284470
## Region       1.381390  3        1.055325

Observation #47 and #112 high residuals

# Per-variable mean and median of the numeric columns, reshaped to one
# row per variable (for comparison against the flagged observations)
numeric_data <- base_data |> select(where(is.numeric))
stats <- numeric_data |>
  dplyr::summarise(
    dplyr::across(
      .cols = everything(),
      .fns = list(
        mean   = ~ mean(.x, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE)
      )
    )
  ) |>
  pivot_longer(
    cols = everything(),
    names_to = c("variable", ".value"),
    names_pattern = "(.*)_(mean|median)"
  )

# For observations #47 and #112, measure how far each value sits from the
# variable's centre (mean and median). Pipe style unified to |> (the
# original mixed %>% and |>).
outliers_diff <- numeric_data[c(47, 112), ] |>
  as_tibble(rownames = "obs_id") |>
  pivot_longer(
    cols = -obs_id,
    names_to = "variable",
    values_to = "value"
  ) |>
  left_join(stats, by = "variable") |>
  dplyr::mutate(
    diff_from_mean = value - mean,
    diff_from_median = value - median
  )

# Spread back to one row per variable with reader-friendly column names
# (stray trailing commas removed from rename()/select())
final_table <- outliers_diff |>
  pivot_wider(
    id_cols = c(variable, mean, median),
    names_from = obs_id,
    values_from = c(value, diff_from_mean, diff_from_median)
  ) |>
  dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, 3))) |>
  dplyr::rename(
    "#47 Value" = value_47,
    "#47 Mean Distance" = diff_from_mean_47,
    "#47 Median Distance" = diff_from_median_47,
    "#112 Value" = value_112,
    "#112 Mean Distance" = diff_from_mean_112,
    "#112 Median Distance" = diff_from_median_112
  ) |>
  dplyr::select(
    variable, mean, median,
    `#112 Value`, `#47 Value`, `#112 Mean Distance`, `#47 Mean Distance`,
    `#112 Median Distance`, `#47 Median Distance`
  )

# Render as a single static table (dom = "t" removes paging controls)
datatable(final_table, options = list(dom = "t", width = "100%", scrollX = TRUE))
Linear Model without outliers
# Same reduced formula as test_model, refit with the two influential
# observations (#47 and #112) excluded
test_model_remove <- Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region
test_model_remove_fit <- lm(test_model_remove, data = base_data[c(-112, -47), ])

# Diagnostics for the outlier-free refit, then restore the plot layout
par(mfrow = c(2,2))
plot(test_model_remove_fit)

par(mfrow = c(1,1))

After removing observations #47 and #112, the model’s residual diagnostics show notable improvements.

Residuals vs. Fitted plot indicates a reduction in heteroscedasticity, as the variance of residuals is now more evenly spread with less curvature in the trend.

The Q-Q plot aligns more closely with the normal distribution, suggesting an improvement in the normality assumption, as the extreme deviations from #47 and #112 are no longer distorting the distribution.

In the Scale-Location plot, variance appears more stable, reinforcing the improved homoscedasticity.

Finally, the Residuals vs. Leverage plot confirms that these observations had high leverage and strong influence on model coefficients, as their removal results in a more balanced distribution of leverage across data points.

summary(test_model_remove_fit)
## 
## Call:
## lm(formula = test_model_remove, data = base_data[c(-112, -47), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.19321 -0.65320 -0.06784  0.51335  3.01070 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.228576   1.466415   3.566 0.000559 ***
## Age           0.054064   0.023485   2.302 0.023404 *  
## Inf.Risk      0.469248   0.102685   4.570 1.40e-05 ***
## R.Cul.Rat    -0.004346   0.012845  -0.338 0.735797    
## R.CX.ray.Rat  0.008063   0.005934   1.359 0.177264    
## Avg.Nur       0.001118   0.001213   0.922 0.358648    
## Pct.Ser.Fac  -0.002924   0.010752  -0.272 0.786261    
## Med.Sc.AffNo -0.689680   0.349635  -1.973 0.051306 .  
## RegionNC     -0.544629   0.283414  -1.922 0.057494 .  
## RegionS      -0.758340   0.284514  -2.665 0.008968 ** 
## RegionW      -2.069147   0.346224  -5.976 3.53e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.018 on 100 degrees of freedom
## Multiple R-squared:  0.567,  Adjusted R-squared:  0.5237 
## F-statistic: 13.09 on 10 and 100 DF,  p-value: 2.325e-14
vif(test_model_remove_fit)
##                  GVIF Df GVIF^(1/(2*Df))
## Age          1.158175  1        1.076185
## Inf.Risk     1.977350  1        1.406183
## R.Cul.Rat    1.850144  1        1.360200
## R.CX.ray.Rat 1.388417  1        1.178311
## Avg.Nur      3.005297  1        1.733579
## Pct.Ser.Fac  2.836364  1        1.684151
## Med.Sc.Aff   1.615906  1        1.271183
## Region       1.352387  3        1.051599

Performance comparison after removing outliers

# Root-mean-squared error between two equal-length numeric vectors
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}

# In-sample RMSE of the reduced model on all 113 observations.
# (Call-site argument order corrected to match the signature; the original
# passed predictions as `actual` — harmless because the squared difference
# is symmetric, but misleading to readers.)
predictions_before <- predict(test_model_fit, base_data)
rmse_lm_before <- rmse(base_data$Lgth.of.Sty, predictions_before)

# In-sample RMSE of the refit with #47 and #112 removed
base_data_filtered <- base_data[c(-112, -47), ]
predictions_after <- predict(test_model_remove_fit, base_data_filtered)
rmse_lm_after <- rmse(base_data_filtered$Lgth.of.Sty, predictions_after)

sprintf("RMSE without removing = %.3f -- RMSE without 112 and 47 = %.3f", rmse_lm_before, rmse_lm_after)
## [1] "RMSE without removing = 1.295 -- RMSE without 112 and 47 = 0.966"

The reduction in RMSE from 1.295 to 0.966 after removing observations 47 and 112 suggests that these points were contributing to higher model error. Their removal improves the model’s predictive accuracy, indicating that it generalizes better to new data and produces more reliable estimates

Additionally by removing highly correlated variables such as Number of Beds and Average Patients, we’ve substantially lowered variance inflation. This indicates that our changes have mitigated multicollinearity and improved the stability and interpretability of the model.

Model Interpretation

confint(test_model_remove_fit)
##                     2.5 %       97.5 %
## (Intercept)   2.319250967  8.137900348
## Age           0.007470692  0.100657665
## Inf.Risk      0.265523483  0.672973359
## R.Cul.Rat    -0.029830821  0.021138075
## R.CX.ray.Rat -0.003709698  0.019836041
## Avg.Nur      -0.001287584  0.003524206
## Pct.Ser.Fac  -0.024256032  0.018408909
## Med.Sc.AffNo -1.383346930  0.003986702
## RegionNC     -1.106914726  0.017657181
## RegionS      -1.322807364 -0.193872415
## RegionW      -2.756046275 -1.382247990

Our model Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region aims to predict Length of Stay (Lgth.of.Sty) based on key hospital and patient characteristics.

  • Model Performance
    • The model explains 52.4% of the variance in Length of Stay (Lgth.of.Sty) (Adjusted R² = 0.524).
    • RMSE = 0.966, meaning our predictions deviate by approximately 0.97 days from actual hospital stays.
  • Key Predictors
    • Infection Risk (Holding all other variables constant): A one-unit increase in Infection Risk is associated with an increase of approximately 0.47 days in Length of Stay (p < 0.001).
    • Region (West) (Holding all other variables constant): Patients in Western hospitals have an expected shorter stay of approximately 2.07 days compared to the Northeast (p < 0.001).
  • Confidence Intervals
    • Infection Risk 95% CI: (0.266, 0.673)
    • Region (West) 95% CI: (-2.756, -1.382)

Prepare data for section

interaction_data <- base_data[c(-112, -47), ]

Anova Model - Fit ANOVA model

# Full additive model refit on the outlier-free data
anova_model <- lm(Lgth.of.Sty ~ ., data = interaction_data)
summary(anova_model)
## 
## Call:
## lm(formula = Lgth.of.Sty ~ ., data = interaction_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.07731 -0.57223 -0.08947  0.61897  3.06561 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.9450368  1.4707217   3.362 0.001103 ** 
## Age           0.0578446  0.0233165   2.481 0.014811 *  
## Inf.Risk      0.4226656  0.1053017   4.014 0.000117 ***
## R.Cul.Rat     0.0033790  0.0133374   0.253 0.800530    
## R.CX.ray.Rat  0.0086477  0.0058544   1.477 0.142845    
## N.Beds       -0.0008609  0.0031127  -0.277 0.782699    
## Med.Sc.AffNo -0.4888113  0.3610531  -1.354 0.178899    
## RegionNC     -0.5864174  0.2883010  -2.034 0.044649 *  
## RegionS      -0.8703365  0.2906440  -2.995 0.003480 ** 
## RegionW      -1.8920505  0.3657131  -5.174 1.22e-06 ***
## Avg.Pat       0.0057002  0.0043733   1.303 0.195489    
## Avg.Nur      -0.0023495  0.0019917  -1.180 0.241002    
## Pct.Ser.Fac  -0.0095188  0.0113323  -0.840 0.402968    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.003 on 98 degrees of freedom
## Multiple R-squared:  0.5878, Adjusted R-squared:  0.5373 
## F-statistic: 11.64 on 12 and 98 DF,  p-value: 3.301e-14

ANOVA decomposition of the additive model (note: no interaction terms are included in this model)

anova_results <- anova(anova_model)

Checking Variance Inflation Factor (VIF) for multicollinearity

vif(anova_model)
##                   GVIF Df GVIF^(1/(2*Df))
## Age           1.175247  1        1.084088
## Inf.Risk      2.140653  1        1.463097
## R.Cul.Rat     2.053431  1        1.432980
## R.CX.ray.Rat  1.391222  1        1.179501
## N.Beds       36.769932  1        6.063822
## Med.Sc.Aff    1.773930  1        1.331890
## Region        1.771611  3        1.100005
## Avg.Pat      43.286785  1        6.579269
## Avg.Nur       8.346041  1        2.888952
## Pct.Ser.Fac   3.243355  1        1.800932

High VIF values: N.Beds (36.77), Avg.Pat (43.29), and Avg.Nur (8.35), suggesting multicollinearity is a concern for these variables.

# RMSE derived from the ANOVA residual mean square (equivalent to the
# model's residual standard error)
mse <- anova_results["Residuals", "Mean Sq"]
rmse_anova <- sqrt(mse)
sprintf("Anova RMSE %.3f", rmse_anova)
## [1] "Anova RMSE 1.003"

Interpretations:

The ANOVA model indicates that significant predictors of Length of Stay (Lgth.of.Sty) include Age (p = 0.0148), where a 1 year increase is associated with a 0.0578-day increase, and Infection Risk (p = 0.0001), significantly increasing the length of stay. Regional differences also play a role, with patients in the NC region staying 0.586 days less (p = 0.0446), those in the South staying 0.870 days less (p = 0.0034), and those in the West experiencing the greatest reduction of 1.892 days (p < 0.001). On the other hand, several predictors, including R.Cul.Rat, R.CX.ray.Rat, N.Beds, Med.Sc.AffNo, Avg.Pat, Avg.Nur, and Pct.Ser.Fac, do not significantly impact the length of stay (p > 0.05), suggesting that they may not be strong predictors and may need to be removed from the model.

LASSO Model

# Prepare data: Separate predictors (X) and target variable (y)
X <- model.matrix(Lgth.of.Sty ~ . - 1, data = interaction_data)
y <- interaction_data$Lgth.of.Sty

# Perform LASSO regression with cross-validation
cv_lasso <- cv.glmnet(X, y, alpha = 1, nfolds = 10)

# Plot cross-validation results
plot(cv_lasso)

Optimal \(\lambda\)

# Find the optimal lambda (penalty parameter)
best_lambda <- cv_lasso$lambda.min
sprintf("Optimal λ: %.5f", best_lambda)
## [1] "Optimal λ: 0.00702"

LASSO Fit

# Fit final LASSO model using optimal lambda
lasso_model <- glmnet(X, y, alpha = 1, lambda = best_lambda)
# Extract feature coefficients
lasso_coefficients <- coef(lasso_model)

# Convert coefficients to a named vector
lasso_coefficients <- as.vector(lasso_coefficients)
names(lasso_coefficients) <- rownames(coef(lasso_model))

# Print Intercept
intercept <- lasso_coefficients["(Intercept)"]
sprintf("Intercept: %.3f", intercept)
## [1] "Intercept: 4.518"

Features and Coefficients Selected

# Print selected features and their coefficients
selected_features <- lasso_coefficients[lasso_coefficients != 0]
lasso_table <- tibble(
  Feature = names(selected_features),
  Coefficient = round(selected_features, 3)  # Round to 3 decimals for readability
)

lasso_table
# Compute RMSE using cross-validation
predictions <- predict(lasso_model, X)
rmse_lasso <- sqrt(mean((y - predictions)^2))

# Compute R-squared
ss_total <- sum((y - mean(y))^2)
ss_residual <- sum((y - predictions)^2)
r_squared <- 1 - (ss_residual / ss_total)

# Compute Adjusted R-squared
n <- length(y)
p <- length(selected_features) - 1
adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

sprintf("RMSE %.3f | Adjusted R-squared: %.3f with optimal λ: %.5f", rmse_lasso, adj_r_squared, best_lambda)
## [1] "RMSE 0.944 | Adjusted R-squared: 0.540 with optimal λ: 0.00702"

Objective 2

Prepare data

After analyzing the outliers, we proceed without both of them (observations 47 and 112).

cleaned_obj_2 <- base_data[c(-112, -47), ]

KNN Model - Non-Paramatric

KNN All Variables

# 5-fold CV tuning of k for KNN on all predictors.
# NOTE(review): the caret output reports "No pre-processing" — KNN is
# distance-based, so preProcess = c("center", "scale") would likely
# improve it; confirm before changing, as it would alter the rendered
# results below.
set.seed(1234)
train_control <- trainControl(method = "cv", number = 5)

knn_all_model <- train(
  Lgth.of.Sty ~ .,
  data = cleaned_obj_2,
  method = "knn",
  trControl = train_control,
  tuneGrid = expand.grid(k = c(1:10, 20, 30))
)

print(knn_all_model)
## k-Nearest Neighbors 
## 
## 111 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 88, 90, 88, 89, 89 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  1.636425  0.1570248  1.341632
##    2  1.422562  0.1693380  1.120702
##    3  1.433125  0.1277156  1.120258
##    4  1.361358  0.1735495  1.052855
##    5  1.340433  0.1906822  1.066184
##    6  1.338491  0.1898658  1.066725
##    7  1.308330  0.2283080  1.063999
##    8  1.309400  0.2201435  1.045493
##    9  1.306700  0.2295606  1.036747
##   10  1.325767  0.2110012  1.047278
##   20  1.314614  0.2099153  1.041218
##   30  1.291736  0.2333301  1.027280
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 30.
KNN Tune Plot
plot(knn_all_model)

KNN Best Tune
# Pull the cross-validated RMSE for the k that caret selected
knn_results <- knn_all_model$results
best_k <- knn_all_model$bestTune$k
rmse_knn_all <- knn_results$RMSE[knn_results$k == best_k]

# Label corrected: the reported metric is RMSE, not MSE
sprintf("Best k: %d, Best RMSE: %.3f", best_k, rmse_knn_all)
## [1] "Best k: 30, Best MSE: 1.292"

When looking at all numeric predictors, the best k for RMSE is 30, however, we see that at k = 9, we also have quite a low RMSE as well.

KNN Couple Variables v1

# Same seed as the all-variables fit so folds are comparable.
set.seed(1234)

# k-NN on a reduced predictor set (presumably the LASSO-selected
# variables — see the discussion below), same 5-fold CV tuning.
knn_v1_model <- train(
  Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + N.Beds + Med.Sc.Aff,
  data = cleaned_obj_2,
  method = "knn",
  trControl = trainControl(method = "cv", number = 5),
  tuneGrid = data.frame(k = c(1:10, 20, 30))
)

print(knn_v1_model)
## k-Nearest Neighbors 
## 
## 111 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 88, 90, 88, 89, 89 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  1.607190  0.1527092  1.273367
##    2  1.415132  0.1777442  1.142162
##    3  1.355112  0.2226904  1.101688
##    4  1.343746  0.2298662  1.096834
##    5  1.314924  0.2434483  1.081474
##    6  1.303935  0.2417774  1.078696
##    7  1.284000  0.2547728  1.061646
##    8  1.274059  0.2704031  1.045664
##    9  1.279255  0.2676272  1.035398
##   10  1.267711  0.2791885  1.018944
##   20  1.307496  0.2181723  1.050198
##   30  1.288398  0.2367415  1.022785
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 10.
KNN Tune Plot
plot(knn_v1_model)

KNN Best Tune
# Extract the CV results and report the RMSE at the selected k.
knn_results <- knn_v1_model$results
best_k <- knn_v1_model$bestTune$k
rmse_knn_v1 <- knn_results$RMSE[knn_results$k == best_k]

# Fix: the value reported is RMSE, not MSE — label it correctly.
sprintf("Best k: %d, Best RMSE: %.3f", best_k, rmse_knn_v1)
## [1] "Best k: 10, Best RMSE: 1.268"

For the predictors our LASSO model chose, the RMSE is lowest at k = 10 (1.268), with k = 8 and k = 9 performing nearly as well.

KNN Couple Variables v2

# Same seed again so all three k-NN fits share comparable folds.
set.seed(1234)

# k-NN on a smaller, hand-picked predictor set, same 5-fold CV tuning.
knn_v2_model <- train(
  Lgth.of.Sty ~ Age + Inf.Risk + Region + Med.Sc.Aff,
  data = cleaned_obj_2,
  method = "knn",
  trControl = trainControl(method = "cv", number = 5),
  tuneGrid = data.frame(k = c(1:10, 20, 30))
)

print(knn_v2_model)
## k-Nearest Neighbors 
## 
## 111 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 88, 90, 88, 89, 89 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  1.794866  0.1190193  1.396178
##    2  1.435927  0.2136443  1.130902
##    3  1.366410  0.2279952  1.085376
##    4  1.360840  0.2363762  1.091787
##    5  1.325933  0.2258781  1.056191
##    6  1.266148  0.2922350  1.015717
##    7  1.296658  0.2620468  1.048769
##    8  1.336213  0.1995070  1.087715
##    9  1.327809  0.2070321  1.080879
##   10  1.344680  0.1797312  1.084815
##   20  1.338769  0.2516622  1.077299
##   30  1.398054  0.1439504  1.126062
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 6.
KNN Tune Plot
plot(knn_v2_model)

KNN Best Tune
# Extract the CV results and report the RMSE at the selected k.
knn_results <- knn_v2_model$results
best_k <- knn_v2_model$bestTune$k
rmse_knn_v2 <- knn_results$RMSE[knn_results$k == best_k]

# Fix: the value reported is RMSE, not MSE — label it correctly.
sprintf("Best k: %d, Best RMSE: %.3f", best_k, rmse_knn_v2)
## [1] "Best k: 6, Best RMSE: 1.266"

Interestingly, for the predictors Age, Inf.Risk, Med.Sc.Aff, and Region, k = 6 has the best RMSE.

Model Comparison

# Recover the predictor (term) labels each fitted model used.
lm_all_predictors <- attr(terms(test_model_fit), "term.labels")
lm_no_outliers_predictors <- attr(terms(test_model_remove_fit), "term.labels")
anova_predictors <- rownames(anova_results)
lasso_predictors <- lasso_table$Feature
knn_all_predictors <- attr(knn_all_model$terms, "term.labels")
knn_v1_predictors <- attr(knn_v1_model$terms, "term.labels")
knn_v2_predictors <- attr(knn_v2_model$terms, "term.labels")

# Collect the per-model feature sets in display order; the ANOVA
# table's "Residuals" row is not a predictor, so drop it here.
feature_sets <- list(
  lm_all_predictors,
  lm_no_outliers_predictors,
  anova_predictors[anova_predictors != "Residuals"],
  lasso_predictors,
  knn_all_predictors,
  knn_v1_predictors,
  knn_v2_predictors
)

# One row per model: name, CV/test RMSE, and the features it used,
# sorted so the best-performing model appears first.
model_comparison <- tibble(
  Model = c(
    "Linear Model (All Predictors)",
    "Linear Model (No Outliers)",
    "ANOVA",
    "LASSO",
    "k-NN (All Predictors)",
    "k-NN (Version 1)",
    "k-NN (Version 2)"
  ),
  RMSE = round(c(
    rmse_lm_before,
    rmse_lm_after,
    rmse_anova,
    rmse_lasso,
    rmse_knn_all,
    rmse_knn_v1,
    rmse_knn_v2
  ), 3),
  Features = vapply(feature_sets, str_c, character(1), collapse = " + ")
) |> arrange(RMSE)

# Column renderer: soft-wrap long feature lists so the table stays compact.
wrap_long_text <- JS(
  "function(data, type, row) {
        return type === 'display' && data.length > 50 ?
          '<span style=\"white-space:normal;\">' + data + '</span>' :
          data;
      }"
)

# Render the comparison table: no paging controls ('t' dom), centered
# fixed-width RMSE column, wrapped Features column, no row names.
datatable(
  model_comparison,
  rownames = FALSE,
  options = list(
    dom = "t",
    autoWidth = TRUE,
    columnDefs = list(
      list(width = "120px", targets = 1, className = "dt-center"),
      list(targets = 2, render = wrap_long_text)
    )
  )
)